#!/bin/bash
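# Copies an HDFS directory to another HDFS location via a Spark job,
# then verifies the result by listing and printing the copied files.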
# Only HDFS files and directories are supported.
# Source file or directory to copy
SOURCE="/user/jonas/output"
# Destination directory
DIST="/user/jonas/output1"
# Clean up output left over from a previous run
hdfs dfs -rm -f -R /user/jonas/distcp_output
# Remove the copy produced by the previous run
hdfs dfs -rm -f -R "$DIST"
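# Submit the copy job to YARN. Answer2App is assumed to take three positional
# arguments: the source URI, the destination URI, and a parallelism hint
# (the trailing "4"); adjust these if the application expects something else.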
spark-submit --master yarn \
  --class com.jonas.sparkwork.answer2.Answer2App \
  spark-work_2.12-0.1.0-SNAPSHOT.jar \
  "hdfs://$SOURCE" \
  "hdfs://$DIST" \
  4
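# Minimal safety check (assumes spark-submit exits non-zero when the job fails);
# skip the verification steps below if the copy did not succeed.
if [ $? -ne 0 ]; then
  echo "Spark copy job failed; skipping verification." >&2
  exit 1
fi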
printf "\nListing everything under the user directory after the copy:\n\n"
hdfs dfs -ls -R /user/jonas
printf "\nContents of the copied output part files:\n\n"
hdfs dfs -find "$DIST" | grep "part-" | while read -r line; do
  hdfs dfs -cat "$line"
done
